Initial Replication Study

Figure 4

First, I get the data.

immigration_data <- read_csv('google_trends_data.csv', skip=3, col_names=c('month', 'welfare_trend', 'crime_trend', 'report_trend')) |>
    mutate(month = parse_date(month, "%Y-%m")) |>
    mutate(president=as.factor(ifelse(year(month) <= 2008, "Bush", ifelse(year(month) <= 2016, "Obama", "Trump"))))
## Rows: 192 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): month
## dbl (3): welfare_trend, crime_trend, report_trend
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Now, we recreate the plot.

immigration_data |>
    pivot_longer(names_to = "trend_type", values_to = "trend_value", cols = ends_with("_trend")) |>
    mutate(trend_type=factor(trend_type, labels=c("report_trend", "crime_trend", "welfare_trend"))) |>
    ggplot(aes(x=month, y=trend_value, color=president)) +
    geom_point(alpha = 0.3) +
    geom_smooth(method=lm, formula = y ~ x, se=F) +
    facet_wrap(~ trend_type, dir="v")

## Table 3

Now we do table 3.

table_3_data <- immigration_data |>
    mutate(is_bush = president == "Bush", is_trump = president == "Trump")

models <- list(
    crime = lm(crime_trend ~ month + is_bush + is_trump, data = table_3_data),
    welfare = lm(welfare_trend ~ month + is_bush + is_trump, data = table_3_data),
    report = lm(report_trend ~ month + is_bush + is_trump, data = table_3_data)
)

msummary(models, output='markdown')
tinytable_ox62a780uyq5k8tvfbfw
crime welfare report
(Intercept) -31.109 -17.690 60.338
(13.241) (10.542) (16.691)
month 0.003 0.002 -0.001
(0.001) (0.001) (0.001)
is_bushTRUE 2.836 1.673 -0.748
(2.388) (1.901) (3.010)
is_trumpTRUE 7.348 3.798 15.219
(2.294) (1.826) (2.892)
Num.Obs. 192 192 192
R2 0.351 0.265 0.197
R2 Adj. 0.341 0.253 0.184
AIC 1345.8 1258.3 1434.7
BIC 1362.1 1274.6 1451.0
Log.Lik. -667.905 -624.134 -712.370
RMSE 7.84 6.24 9.89

We are getting different values for the model. Not only are the constants off, but they have welfare increasing by date, while we have it decreasing. They also have big positive changes from Bush. We have small positive changes from Crime and Welfare, and by Reporting, where they have a small positive change, we have a small negative change. And they find bigger changes for Trump than we do.

But, this still supports their point from this table, which was that Trump’s election led to a large positive increase in all these searches, which we do find.

table_3_data |>
    add_predictions(models[["report"]]) |>
    ggplot(aes(x=month, y=report_trend, color=president)) +
    geom_point(alpha = 0.3) +
    geom_smooth(method=lm, formula = y ~ x, se=F) +
    geom_line(aes(y=pred), linetype="dashed")

table_3_data |>
    add_predictions(models[["crime"]]) |>
    ggplot(aes(x=month, y=crime_trend, color=president)) +
    geom_point(alpha = 0.3) +
    geom_smooth(method=lm, formula = y ~ x, se=F) +
    geom_line(aes(y=pred), linetype="dashed")

table_3_data |>
    add_predictions(models[["welfare"]]) |>
    ggplot(aes(x=month, y=welfare_trend, color=president)) +
    geom_point(alpha = 0.3) +
    geom_smooth(method=lm, formula = y ~ x, se=F) +
    geom_line(aes(y=pred), linetype="dashed")

This looks very different from out original plot, but that makes sense. Before, geom_smooth was using * to have different slopes for the different presidencies. Since here, we only have a constant based on who was president, we have one slope for everything.

Figures 2 and 3

Now, we get the topic model data.

load("TopicModel.RData")
document_topics <- make.dt(immigrFit, meta = out$meta)
topic_terms <- t(exp(immigrFit$beta$logbeta[[1]]))
rownames(topic_terms) <- out$vocab
colnames(topic_terms) <- sprintf("Topic%d", 1:ncol(topic_terms))

document_topics <- document_topics |>
    mutate(date = ymd(date))

First, we make figure 2. It is about immigration coverage in general, not of a specific category, so I think all the documents there count.

campaign_start <- document_topics |>
    filter(time == "pre-election") |>
    arrange(date) |>
    slice_tail(n=1) |>
    pull(date)

campaign_end <- document_topics |>
    filter(time == "post-election") |>
    arrange(date) |>
    slice_head(n=1) |>
    pull(date)

document_topics |>
    mutate(month = floor_date(date, "month")) |>
    group_by(month, channel, time) |>
    summarize(num_segments = n()) |>
    ggplot(aes(x=month, y=num_segments, color=channel)) +
    geom_point() +
    geom_smooth(aes(group = interaction(channel, time)), se=F) +
    geom_vline(aes(xintercept=campaign_start), linetype="dashed") +
    geom_vline(aes(xintercept=campaign_end), linetype="dashed") +
    scale_color_manual(values=c("magenta", "red", "blue"))
## `summarise()` has grouped output by 'month', 'channel'. You can override using
## the `.groups` argument.
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

We have essentially the same results as they do, but with a less severe jump after the campaign start and a bigger drop before the inauguration. But their overall findings still show.

document_topics |>
    mutate(month = floor_date(date, "month")) |>
    group_by(month, channel, time) |>
    summarize(proportion_1 = sum(Topic1), proportion_3 = sum(Topic3), crime_coverage = proportion_1 + proportion_3) |>
    ggplot(aes(x=month, y=crime_coverage, color=channel)) +
    geom_point() +
    geom_smooth(aes(group = interaction(channel, time)), se=F) +
    geom_vline(aes(xintercept=campaign_start), linetype="dashed") +
    geom_vline(aes(xintercept=campaign_end), linetype="dashed") +
    scale_color_manual(values=c("magenta", "red", "blue"))
## `summarise()` has grouped output by 'month', 'channel'. You can override using
## the `.groups` argument.
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

document_topics |>
    mutate(month = floor_date(date, "month")) |>
    group_by(month, channel, time) |>
    summarize(welfare_coverage = sum(Topic13)) |>
    ggplot(aes(x=month, y=welfare_coverage, color=channel)) +
    geom_point() +
    geom_smooth(aes(group = interaction(channel, time)), se=F) +
    geom_vline(aes(xintercept=campaign_start), linetype="dashed") +
    geom_vline(aes(xintercept=campaign_end), linetype="dashed") +
    scale_color_manual(values=c("magenta", "red", "blue"))
## `summarise()` has grouped output by 'month', 'channel'. You can override using
## the `.groups` argument.
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

This time, we get identical results. I suspect that in the original, we only differ because of how we deal with the months bifrucated by the dotted lines. We were consistent, always treating them as multiple points, causing the lines to cross the dotted lines. They only seem to have done that for the second one for some reason.

Table 4 Monthly

Now we reproduce table 4. This is before we get the daily data, so we do it on a monthly level.

topic_table <- document_topics |>
    mutate(month = floor_date(date, "month")) |>
    group_by(month, channel, time) |>
    summarize(proportion_1 = sum(Topic1), proportion_3 = sum(Topic3), crime_coverage = proportion_1 + proportion_3, welfare_coverage = sum(Topic13), num_segments = n()) |>
    mutate(trump_admin = time == "post-election", month_of_year=month(month, label=T)) |>
    inner_join(immigration_data, by="month")
## `summarise()` has grouped output by 'month', 'channel'. You can override using
## the `.groups` argument.
lm(report_trend ~ num_segments + crime_coverage + welfare_coverage + trump_admin + month + month_of_year, topic_table) |>
    msummary(output='markdown')
tinytable_sqxokauo2s89nujviel7
(1)
(Intercept) 25.700
(31.861)
num_segments 0.015
(0.004)
crime_coverage 0.010
(0.050)
welfare_coverage -0.036
(0.143)
trump_adminTRUE 14.941
(2.210)
month 0.001
(0.002)
month_of_year.L -12.171
(2.006)
month_of_year.Q 9.143
(1.911)
month_of_year.C 0.275
(1.955)
month_of_year^4 1.986
(1.958)
month_of_year^5 2.206
(1.982)
month_of_year^6 -4.164
(1.991)
month_of_year^7 2.492
(1.995)
month_of_year^8 0.790
(1.972)
month_of_year^9 -0.473
(1.937)
month_of_year^10 -3.109
(1.911)
month_of_year^11 1.139
(1.891)
Num.Obs. 202
R2 0.677
R2 Adj. 0.649
AIC 1424.9
BIC 1484.4
Log.Lik. -694.433
RMSE 7.53

Obviously, this is different from their result, because it is on a monthly level. But we found significantly lower effects of number of segments and crime coverage. These things shouldn’t change, but perhaps the effect only exists in the short-term (but if so, that changes the paper’s conclusions). Though we did still have a high modifier for Trump’s presidency, persumably because that was a long-term effect.

Figure 4 and Table 3 Using Replication Files

We want to recreate figure 4 and table 3 using the data they used to see if we still get the same results.

report_data <- read_csv('from_replication_files/google_trends_report.csv') |>
    rename(report_trend = search)
## Rows: 190 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): month
## dbl (2): year, search
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
crime_data <- read_csv('from_replication_files/google_trends_crime.csv') |>
    rename(crime_trend = search)
## Rows: 190 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): month
## dbl (2): year, search
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
welfare_data <- read_csv('from_replication_files/google_trends_welfare.csv') |>
    rename(welfare_trend = search)
## Rows: 190 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): month
## dbl (2): year, search
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
trend_data_given <- report_data |>
    inner_join(crime_data, by=c("year", "month")) |>
    inner_join(welfare_data, by=c("year", "month")) |>
    mutate(month = paste(year, month), month = parse_date(month, "%Y %m")) |>
    mutate(president=as.factor(ifelse(year(month) <= 2008, "Bush", ifelse(year(month) <= 2016, "Obama", "Trump"))))

trend_data_given |>
    pivot_longer(names_to = "trend_type", values_to = "trend_value", cols = ends_with("_trend")) |>
    mutate(trend_type=factor(trend_type, labels=c("report_trend", "crime_trend", "welfare_trend"))) |>
    ggplot(aes(x=month, y=trend_value, color=president)) +
    geom_point(alpha = 0.3) +
    geom_smooth(method=lm, formula = y ~ x, se=F) +
    facet_wrap(~ trend_type, dir="v")

table_3_data_given <- trend_data_given |>
    mutate(is_bush = president == "Bush", is_trump = president == "Trump")

list(
    crime = lm(crime_trend ~ month + is_bush + is_trump, data = table_3_data_given),
    welfare = lm(welfare_trend ~ month + is_bush + is_trump, data = table_3_data_given),
    report = lm(report_trend ~ month + is_bush + is_trump, data = table_3_data_given)
) |> msummary(output='markdown')
tinytable_l7ofgz5jbnn2lkwhd2mr
crime welfare report
(Intercept) -4.163 28.585 72.574
(23.241) (20.710) (16.089)
month 0.001 0.000 -0.002
(0.001) (0.001) (0.001)
is_bushTRUE 8.280 7.819 0.785
(4.187) (3.731) (2.899)
is_trumpTRUE 19.903 18.560 19.716
(4.027) (3.588) (2.788)
Num.Obs. 190 190 190
R2 0.262 0.237 0.286
R2 Adj. 0.250 0.225 0.275
AIC 1544.4 1500.6 1404.7
BIC 1560.7 1516.8 1420.9
Log.Lik. -767.216 -745.305 -697.339
RMSE 13.72 12.23 9.50

With this, we get the same data as he does. We still don’t get quite the same regression, but we get closer. Interestingly, a lot of times, they have non-zero trends when we have zero trends.

But it’s also true that he downloaded this in 3 separate files while we did it in a single file. Since Google measures things relative to each other when you download them together, it’s possible that this changes things.

Table 4 Using Replication Daily Data

Now we will replicate Table 4 using the given day values.

given_daily_data <- read_csv('from_replication_files/gt_report_daily.csv')
## New names:
## Rows: 5751 Columns: 4
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," dbl
## (3): ...1, search, search_adj date (1): date
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
topic_table_daily <- document_topics |>
    group_by(date, channel, time) |>
    summarize(proportion_1 = sum(Topic1), proportion_3 = sum(Topic3), crime_coverage = proportion_1 + proportion_3, welfare_coverage = sum(Topic13), num_segments = n()) |>
    mutate(trump_admin = time == "post-election", month_of_year=month(date, label=T), day_of_week=wday(date, label=T)) |>
    inner_join(given_daily_data, by="date")
## `summarise()` has grouped output by 'date', 'channel'. You can override using
## the `.groups` argument.
list(
    search_model=lm(search ~ num_segments + crime_coverage + welfare_coverage + trump_admin + date + day_of_week + month_of_year, topic_table_daily),
    search_adj_model=lm(search_adj ~ num_segments + crime_coverage + welfare_coverage + trump_admin + date + day_of_week + month_of_year, topic_table_daily)
)|>
    msummary(output='markdown')
tinytable_ae6mh6bzz99kts2qodjf
search_model search_adj_model
(Intercept) 117.614 93.523
(15.362) (15.953)
num_segments 0.141 0.203
(0.026) (0.027)
crime_coverage -0.431 0.277
(0.313) (0.325)
welfare_coverage 0.954 0.934
(0.783) (0.814)
trump_adminTRUE 9.146 20.280
(1.048) (1.088)
date -0.005 -0.003
(0.001) (0.001)
day_of_week.L 1.176 1.579
(0.692) (0.719)
day_of_week.Q -7.476 -7.231
(0.694) (0.720)
day_of_week.C -0.020 -0.447
(0.690) (0.716)
day_of_week^4 -0.689 -0.952
(0.688) (0.714)
day_of_week^5 3.147 2.744
(0.688) (0.714)
day_of_week^6 -1.341 -1.567
(0.686) (0.712)
month_of_year.L 6.419 -11.125
(0.941) (0.977)
month_of_year.Q 3.229 8.303
(0.902) (0.937)
month_of_year.C 1.407 -2.957
(0.923) (0.958)
month_of_year^4 4.613 4.193
(0.918) (0.953)
month_of_year^5 0.408 2.869
(0.910) (0.945)
month_of_year^6 -4.469 -6.073
(0.920) (0.955)
month_of_year^7 2.725 1.715
(0.913) (0.948)
month_of_year^8 -2.054 0.035
(0.916) (0.951)
month_of_year^9 2.672 -1.323
(0.917) (0.953)
month_of_year^10 -3.272 -1.169
(0.896) (0.931)
month_of_year^11 0.567 1.562
(0.878) (0.912)
Num.Obs. 5053 5053
R2 0.089 0.268
R2 Adj. 0.085 0.265
AIC 43850.3 44231.8
BIC 44006.9 44388.4
Log.Lik. -21901.136 -22091.890
RMSE 18.46 19.17

From here, we see that while neither of them are exactly identical, the one using the adjusted values is much closer to what they got. But the non-adjusted version actually goes against it, with a greater number of searches with less crime coverage.

Looking at the data, it seems that the problem is that they can only look at one time period (say, month) at a time, and each month is only scaled with other searches in that month. But, I don’t understand how their normalization works.

Figure 4 and Table 3 Using Individually Downloaded Data

We noticed that they got each prompt separately. Since Google Trends weights everything you are comparing to the max of the other, if we want to replicate their results, then we need to get them separately like what the researchers seem to have done.

report_data <- read_csv('from_google_trends/report_trend_individual.csv', skip=3, col_names=c('month', 'report_trend'))
## Rows: 192 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): month
## dbl (1): report_trend
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
crime_data <- read_csv('from_google_trends/crime_trend_individual.csv', skip=3, col_names=c('month', 'crime_trend'))
## Rows: 192 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): month
## dbl (1): crime_trend
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
welfare_data <- read_csv('from_google_trends/welfare_trend_individual.csv', skip=3, col_names=c('month', 'welfare_trend'))
## Rows: 192 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): month
## dbl (1): welfare_trend
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
trend_data_individual <- report_data |>
    inner_join(crime_data, by=c("month")) |>
    inner_join(welfare_data, by=c("month")) |>
    mutate(month = parse_date(month, "%Y-%m")) |>
    mutate(president=as.factor(ifelse(year(month) <= 2008, "Bush", ifelse(year(month) <= 2016, "Obama", "Trump"))))

trend_data_individual |>
    pivot_longer(names_to = "trend_type", values_to = "trend_value", cols = ends_with("_trend")) |>
    mutate(trend_type=factor(trend_type, labels=c("report_trend", "crime_trend", "welfare_trend"))) |>
    ggplot(aes(x=month, y=trend_value, color=president)) +
    geom_point(alpha = 0.3) +
    geom_smooth(method=lm, formula = y ~ x, se=F) +
    facet_wrap(~ trend_type, dir="v")

table_3_data_individual <- trend_data_individual |>
    mutate(is_bush = president == "Bush", is_trump = president == "Trump")

list(
    crime = lm(crime_trend ~ month + is_bush + is_trump, data = table_3_data_individual),
    welfare = lm(welfare_trend ~ month + is_bush + is_trump, data = table_3_data_individual),
    report = lm(report_trend ~ month + is_bush + is_trump, data = table_3_data_individual)
) |> msummary(output='markdown')
tinytable_aoknccietzez8udx6t4f
crime welfare report
(Intercept) -57.817 -20.711 68.858
(23.105) (27.392) (17.014)
month 0.005 0.003 -0.002
(0.001) (0.002) (0.001)
is_bushTRUE 6.951 4.410 -1.618
(4.166) (4.939) (3.068)
is_trumpTRUE 12.422 12.235 16.077
(4.003) (4.746) (2.948)
Num.Obs. 192 192 192
R2 0.339 0.190 0.190
R2 Adj. 0.329 0.177 0.178
AIC 1559.6 1625.0 1442.1
BIC 1575.9 1641.2 1458.4
Log.Lik. -774.798 -807.478 -716.049
RMSE 13.69 16.23 10.08

But the paper still has welfare searches going up in Trump’s presidency, while we have it going down slightly. And in the table, they have welfare going down over time, while we have it going up. So, this discrepency doesn’t explain anything.

Extension